1 Setup

Set the working dir

setwd("/mnt/picea/projects/arabidopsis/mschmid/porcupine-RNA-Seq")

Load libraries

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))

Source some helper functions

source("~/Git/UPSCb/src/R/plot.multidensity.R")

Create a palette

pal <- brewer.pal(8,"Dark2")

Register the default plot margin

mar <- par("mar")

2 Raw data

2.1 Loading

Read the sample information

samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")

Read the tx2gene translation

tx2gene <- read.table("/mnt/picea/storage/reference/Arabidopsis-thaliana/TAIR10/kallisto/tx2gene.txt",
                      header=TRUE)

tx2gene <- tx2gene[,2:1]

2.1.1 Re-sequencing data

We start with these as they have been split in multiple files

Read the re-sequencing data; list the files

rerun <- list.files("rerun/kallisto", 
                    recursive = TRUE, 
                    pattern = "abundance.tsv",
                    full.names = TRUE)

name them

names(rerun) <- sub("_sortmerna.*","",
                    sapply(strsplit(rerun, "/"), .subset, 3))

create a rerun sample file

samples.2 <- data.frame(ID=names(rerun),
                        Parent=sub("_.*","",names(rerun)),
                        stringsAsFactors=FALSE)

Read the expression at the transcript level

tx2 <- suppressMessages(tximport(files = rerun, type = "kallisto", 
                txOut = TRUE))
kt2 <- round(tx2$counts)

And summarize it at the gene level

kg2 <- round(summarizeToGene(txi=tx2,tx2gene=tx2gene)$counts)
## summarizing abundance
## summarizing counts
## summarizing length

2.1.2 Original data

orig <- list.files("firstrun/kallisto", 
                    recursive = TRUE, 
                    pattern = "abundance.tsv",
                    full.names = TRUE)

name them

names(orig) <- sub("_S[0-9]_L[0-9]{3}_sortmerna.*","",
                    sapply(strsplit(orig, "/"), .subset, 3))

Reorder the sample data.frame according to the way the results were read in and accounting for tech reps

samples <- samples[match(names(orig),samples$SampleID),]

Add some categorical variables

samples$Genotype=sub(" .*","",samples$Description)
samples$Temp=ifelse(sub(".* ","",samples$Description)=="16",
                    ifelse(nchar(as.character(samples$Description))>16,"WC","C"),
                    ifelse(nchar(as.character(samples$Description))>16,"CW","W"))

Read the expression at the transcript level

tx <- suppressMessages(tximport(files = orig, 
                                type = "kallisto", 
                                txOut = TRUE))
kt <- round(tx$counts)

And summarize it at the gene level

kg <- round(summarizeToGene(txi=tx,tx2gene=tx2gene)$counts)
## summarizing abundance
## summarizing counts
## summarizing length

2.1.3 Combined data

ct.2 <- do.call(cbind,
         lapply(split.data.frame(t(kt2),
                                 samples.2$Parent),
                colSums))
ct <- kt
ct[,match(c("13","14"),colnames(ct))] <- 
  ct[,match(c("13","14"),colnames(ct))] + 
  ct.2[,c("pcp-3D-23-1","pcp-3D-23-2")]

cg.2 <- do.call(cbind,
                lapply(split.data.frame(t(kg2),
                                        samples.2$Parent),
                       colSums))
cg <- kg
cg[,match(c("13","14"),colnames(cg))] <- 
  cg[,match(c("13","14"),colnames(cg))] + 
  cg.2[,c("pcp-3D-23-1","pcp-3D-23-2")]

2.2 Raw Data QC analysis

2.2.1 Re-sequencing data

Check how many genes are never expressed

sel <- rowSums(kg2) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(kg2),digits=1),
        sum(sel),
        nrow(kg2))
## [1] "30.3% percent (10165) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
        round(sum(sel) * 100/ nrow(kt2),digits=1),
        sum(sel),
        nrow(kt2))
## [1] "24.4% percent (10165) of 41671 transcripts are not expressed"

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

plot(density(log10(rowMeans(kg2))),col=pal[1],
     main="gene mean raw counts distribution",
     xlab="mean raw counts (log10)")

The cumulative transcript coverage is as expected

plot(density(log10(rowMeans(kt2))),col=pal[1],
     main="transcript mean raw counts distribution",
     xlab="mean raw counts (log10)")

The same is done for the individual samples colored by condition. The gene and transcript coverage look very similar. The split are all identical, aparts from the 2 with uneven depth (the splits with << 10M reads)

plot.multidensity(lapply(1:ncol(kg2),function(k){log10(kg2)[,k]}),
                  col=pal[as.integer(as.factor(samples.2$Parent))],
                  legend.x="topright",
                  legend=levels(as.factor(samples.2$Parent)),
                  legend.col=pal[1:nlevels(as.factor(samples.2$Parent))],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

plot.multidensity(lapply(1:ncol(kt2),function(k){log10(kt2)[,k]}),
                  col=pal[as.integer(as.factor(samples.2$Parent))],
                  legend.x="topright",
                  legend=levels(as.factor(samples.2$Parent)),
                  legend.col=pal[1:nlevels(as.factor(samples.2$Parent))],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per transcript raw counts (log10)")

2.2.2 Original data

Check how many genes are never expressed

sel <- rowSums(kg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(kg),digits=1),
        sum(sel),
        nrow(kg))
## [1] "17.2% percent (5791) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
        round(sum(sel) * 100/ nrow(kt),digits=1),
        sum(sel),
        nrow(kt))
## [1] "13.9% percent (5791) of 41671 transcripts are not expressed"

The cumulative gene coverage is as expected

plot(density(log10(rowMeans(kg))),col=pal[1],
     main="gene mean raw counts distribution",
     xlab="mean raw counts (log10)")

The cumulative transcript coverage is as expected

plot(density(log10(rowMeans(kt))),col=pal[1],
     main="transcript mean raw counts distribution",
     xlab="mean raw counts (log10)")

The same is done for the individual samples colored by condition. We oberved the expected bias from the 2 undersequenced smaples

plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
                  col=pal[as.integer(samples$Description)],
                  legend.x="topright",
                  legend=levels(samples$Description),
                  legend.col=pal[1:nlevels(samples$Description)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

plot.multidensity(lapply(1:ncol(kt),function(k){log10(kt)[,k]}),
                  col=pal[as.integer(samples$Description)],
                  legend.x="topright",
                  legend=levels(samples$Description),
                  legend.col=pal[1:nlevels(samples$Description)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per transcript raw counts (log10)")

2.2.3 Combined data

Check how many genes are never expressed

sel <- rowSums(cg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(cg),digits=1),
        sum(sel),
        nrow(cg))
## [1] "16.6% percent (5593) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
        round(sum(sel) * 100/ nrow(ct),digits=1),
        sum(sel),
        nrow(ct))
## [1] "13.4% percent (5593) of 41671 transcripts are not expressed"

The cumulative gene coverage is as expected

plot(density(log10(rowMeans(cg))),col=pal[1],
     main="gene mean raw counts distribution",
     xlab="mean raw counts (log10)")

The cumulative transcript coverage is as expected

plot(density(log10(rowMeans(ct))),col=pal[1],
     main="transcript mean raw counts distribution",
     xlab="mean raw counts (log10)")

The same is done for the individual samples colored by condition. We oberved the expected bias from the 2 undersequenced smaples

plot.multidensity(lapply(1:ncol(cg),function(k){log10(cg)[,k]}),
                  col=pal[as.integer(samples$Description)],
                  legend.x="topright",
                  legend=levels(samples$Description),
                  legend.col=pal[1:nlevels(samples$Description)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

plot.multidensity(lapply(1:ncol(ct),function(k){log10(ct)[,k]}),
                  col=pal[as.integer(samples$Description)],
                  legend.x="topright",
                  legend=levels(samples$Description),
                  legend.col=pal[1:nlevels(samples$Description)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per transcript raw counts (log10)")

2.3 Raw data export

2.3.1 Combined only

dir.create(file.path("analysis","kallisto"),showWarnings=FALSE)
write.csv(cg,file="analysis/kallisto/combined-raw-unormalised-gene-expression_data.csv")
write.csv(ct,file="analysis/kallisto/combined-raw-unormalised-transcript-expression_data.csv")

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Re-sequencing data Create the dds object, without giving any prior on the design

conditions <- colnames(kg2)
dds.kg2 <- DESeqDataSetFromMatrix(
  countData = kg2,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.kg2 <- estimateSizeFactors(dds.kg2)
sizes.kg2 <- sizeFactors(dds.kg2)
names(sizes.kg2) <- colnames(kg2)
pander(sizes.kg2)
Table continues below
pcp-3D-23-1_S1_10 pcp-3D-23-1_S1_11 pcp-3D-23-1_S1_12 pcp-3D-23-1_S1_13
1.137 1.127 1.138 1.115
Table continues below
pcp-3D-23-1_S1_14 pcp-3D-23-1_S1_15 pcp-3D-23-1_S1_16 pcp-3D-23-1_S1_17
1.133 1.14 1.139 1.135
Table continues below
pcp-3D-23-1_S1_18 pcp-3D-23-1_S1_19 pcp-3D-23-1_S1_1 pcp-3D-23-1_S1_20
1.135 1.133 1.133 1.138
Table continues below
pcp-3D-23-1_S1_21 pcp-3D-23-1_S1_22 pcp-3D-23-1_S1_2 pcp-3D-23-1_S1_3
1.14 0.4358 1.128 1.132
Table continues below
pcp-3D-23-1_S1_4 pcp-3D-23-1_S1_5 pcp-3D-23-1_S1_6 pcp-3D-23-1_S1_7
1.139 1.134 1.134 1.133
Table continues below
pcp-3D-23-1_S1_8 pcp-3D-23-1_S1_9 pcp-3D-23-2_S2_10 pcp-3D-23-2_S2_11
1.135 1.134 1.031 1.029
Table continues below
pcp-3D-23-2_S2_12 pcp-3D-23-2_S2_13 pcp-3D-23-2_S2_14 pcp-3D-23-2_S2_15
1.034 1.02 1.03 1.037
Table continues below
pcp-3D-23-2_S2_16 pcp-3D-23-2_S2_17 pcp-3D-23-2_S2_18 pcp-3D-23-2_S2_19
1.037 1.032 1.035 1.035
Table continues below
pcp-3D-23-2_S2_1 pcp-3D-23-2_S2_20 pcp-3D-23-2_S2_21 pcp-3D-23-2_S2_2
1.033 1.038 0.1794 1.03
Table continues below
pcp-3D-23-2_S2_3 pcp-3D-23-2_S2_4 pcp-3D-23-2_S2_5 pcp-3D-23-2_S2_6
1.034 1.035 1.029 1.03
pcp-3D-23-2_S2_7 pcp-3D-23-2_S2_8 pcp-3D-23-2_S2_9
1.031 1.031 1.042
boxplot(sizes.kg2, main="Sequencing libraries size factor")

At the transcript level

dds.kt2 <- DESeqDataSetFromMatrix(
  countData = kt2,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.kt2 <- estimateSizeFactors(dds.kt2)
sizes.kt2 <- sizeFactors(dds.kt2)
names(sizes.kt2) <- colnames(kt2)
pander(sizes.kt2)
Table continues below
pcp-3D-23-1_S1_10 pcp-3D-23-1_S1_11 pcp-3D-23-1_S1_12 pcp-3D-23-1_S1_13
1.128 1.118 1.127 1.11
Table continues below
pcp-3D-23-1_S1_14 pcp-3D-23-1_S1_15 pcp-3D-23-1_S1_16 pcp-3D-23-1_S1_17
1.125 1.135 1.132 1.128
Table continues below
pcp-3D-23-1_S1_18 pcp-3D-23-1_S1_19 pcp-3D-23-1_S1_1 pcp-3D-23-1_S1_20
1.128 1.126 1.126 1.131
Table continues below
pcp-3D-23-1_S1_21 pcp-3D-23-1_S1_22 pcp-3D-23-1_S1_2 pcp-3D-23-1_S1_3
1.135 0.4342 1.121 1.122
Table continues below
pcp-3D-23-1_S1_4 pcp-3D-23-1_S1_5 pcp-3D-23-1_S1_6 pcp-3D-23-1_S1_7
1.132 1.13 1.122 1.126
Table continues below
pcp-3D-23-1_S1_8 pcp-3D-23-1_S1_9 pcp-3D-23-2_S2_10 pcp-3D-23-2_S2_11
1.128 1.13 1.041 1.037
Table continues below
pcp-3D-23-2_S2_12 pcp-3D-23-2_S2_13 pcp-3D-23-2_S2_14 pcp-3D-23-2_S2_15
1.042 1.03 1.041 1.047
Table continues below
pcp-3D-23-2_S2_16 pcp-3D-23-2_S2_17 pcp-3D-23-2_S2_18 pcp-3D-23-2_S2_19
1.045 1.042 1.046 1.044
Table continues below
pcp-3D-23-2_S2_1 pcp-3D-23-2_S2_20 pcp-3D-23-2_S2_21 pcp-3D-23-2_S2_2
1.041 1.05 0.1805 1.04
Table continues below
pcp-3D-23-2_S2_3 pcp-3D-23-2_S2_4 pcp-3D-23-2_S2_5 pcp-3D-23-2_S2_6
1.044 1.047 1.04 1.037
pcp-3D-23-2_S2_7 pcp-3D-23-2_S2_8 pcp-3D-23-2_S2_9
1.04 1.039 1.052
boxplot(sizes.kt2, main="Sequencing libraries size factor")

Just to compare the estimated sizes

plot(sizes.kt2,sizes.kg2,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)

3.1.1 Original data

Create the dds object, without giving any prior on the design

conditions <- colnames(kg)
dds.kg <- DESeqDataSetFromMatrix(
  countData = kg,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.kg <- estimateSizeFactors(dds.kg)
sizes.kg <- sizeFactors(dds.kg)
names(sizes.kg) <- colnames(kg)
pander(sizes.kg)
Table continues below
13 14 15 16 17 18 1 25 26 27 28 29 2
0.07284 0.2717 1.45 1.193 1.27 1.289 0.8732 1.35 1.393 1.247 1.286 1.241 1.062
30 37 38 39 3 40 41 42 4 5 6
1.218 1.249 1.081 1.094 1.057 1.882 1.21 1.01 1.411 1.064 0.8811
boxplot(sizes.kg, main="Sequencing libraries size factor")

At the transcript level

dds.kt <- DESeqDataSetFromMatrix(
  countData = kt,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.kt <- estimateSizeFactors(dds.kt)
sizes.kt <- sizeFactors(dds.kt)
names(sizes.kt) <- colnames(kt)
pander(sizes.kt)
Table continues below
13 14 15 16 17 18 1 25 26 27 28 29 2
0.0741 0.2732 1.453 1.195 1.272 1.29 0.8753 1.35 1.394 1.25 1.288 1.242 1.062
30 37 38 39 3 40 41 42 4 5 6
1.218 1.251 1.082 1.097 1.058 1.886 1.212 1.014 1.412 1.066 0.8826
boxplot(sizes.kt, main="Sequencing libraries size factor")

Just to compare the estimated sizes

plot(sizes.kt,sizes.kg,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)

3.1.2 Combined data

Create the dds object, without giving any prior on the design

conditions <- colnames(kg)
dds.g <- DESeqDataSetFromMatrix(
  countData = cg,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.g <- estimateSizeFactors(dds.g)
sizes.g <- sizeFactors(dds.g)
names(sizes.g) <- colnames(cg)
pander(sizes.g)
Table continues below
13 14 15 16 17 18 1 25 26 27 28 29 2
1.22 1.187 1.19 0.9814 1.046 1.061 0.7175 1.11 1.145 1.025 1.058 1.021 0.8738
30 37 38 39 3 40 41 42 4 5 6
1.002 1.028 0.8902 0.9011 0.8717 1.547 0.9951 0.8304 1.162 0.875 0.725
boxplot(sizes.g, main="Sequencing libraries size factor")

At the transcript level

dds.t <- DESeqDataSetFromMatrix(
  countData = ct,
  colData = data.frame(condition=conditions),
  design = ~ condition)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.t <- estimateSizeFactors(dds.t)
sizes.t <- sizeFactors(dds.t)
names(sizes.t) <- colnames(ct)
pander(sizes.t)
Table continues below
13 14 15 16 17 18 1 25 26 27 28 29 2
1.211 1.176 1.195 0.986 1.052 1.067 0.7211 1.114 1.151 1.031 1.064 1.026 0.8798
30 37 38 39 3 40 41 42 4 5 6
1.007 1.034 0.8944 0.907 0.8739 1.558 1.001 0.8363 1.166 0.8769 0.7272
boxplot(sizes.t, main="Sequencing libraries size factor")

Just to compare the estimated sizes

plot(sizes.t,sizes.g,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)

3.2 Variance Stabilising Transformation

3.2.1 Re-sequencing data

At the gene level

vsd.kg2 <- varianceStabilizingTransformation(dds.kg2, blind=TRUE)
vst.kg2 <- assay(vsd.kg2)
colnames(vst.kg2) <- colnames(kg2)
vst.kg2 <- vst.kg2 - min(vst.kg2)

Validate the VST

meanSdPlot(vst.kg2[rowSums(kg2)>0,])

At the transcript level

vsd.kt2 <- varianceStabilizingTransformation(dds.kt2, blind=TRUE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
vst.kt2 <- assay(vsd.kt2)
colnames(vst.kt2) <- colnames(kt2)
vst.kt2 <- vst.kt2 - min(vst.kt2)

Validate the VST

meanSdPlot(vst.kt2[rowSums(kt2)>0,])

3.2.2 Original data

At the gene level

vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE)
vst.kg <- assay(vsd.kg)
colnames(vst.kg) <- colnames(kg)
vst.kg <- vst.kg - min(vst.kg)

Validate the VST

meanSdPlot(vst.kg[rowSums(kg)>0,])

At the transcript level

vsd.kt <- varianceStabilizingTransformation(dds.kt, blind=TRUE)
vst.kt <- assay(vsd.kt)
colnames(vst.kt) <- colnames(kt)
vst.kt <- vst.kt - min(vst.kt)

Validate the VST

meanSdPlot(vst.kt[rowSums(kt)>0,])

3.2.3 Combined data

At the gene level

vsd.g <- varianceStabilizingTransformation(dds.g, blind=TRUE)
vst.g <- assay(vsd.g)
colnames(vst.g) <- colnames(cg)
vst.g <- vst.g - min(vst.g)
write.csv(vst.g,"analysis/kallisto/combined-library-size-normalized_variance-stabilized_gene-expression_data.csv")

Validate the VST

meanSdPlot(vst.g[rowSums(cg)>0,])

At the transcript level

vsd.t <- varianceStabilizingTransformation(dds.t, blind=TRUE)
vst.t <- assay(vsd.t)
colnames(vst.t) <- colnames(ct)
vst.t <- vst.t - min(vst.t)
write.csv(vst.t,"analysis/kallisto/combined-library-size-normalized_variance-stabilized_transcript-expression_data.csv")

Validate the VST

meanSdPlot(vst.t[rowSums(ct)>0,])

4 QC on the normalised data

4.1 PCA

".pca" <- function(vst,fact,lgd="topleft"){
  pc <- prcomp(t(vst))
  
  percent <- round(summary(pc)$importance[2,]*100)
  
  #' ### 3 first dimensions
  mar=c(5.1,4.1,4.1,2.1)
  scatterplot3d(pc$x[,1],
                pc$x[,2],
                pc$x[,3],
                xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
                ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
                zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
                color=pal[as.integer(fact)],
                pch=19)
  legend(lgd,pch=19,
         col=pal[1:nlevels(fact)],
         legend=levels(fact))
  par(mar=mar)
}

4.1.1 Re-sequencing data

At the gene level

.pca(vst.kg2,factor(samples.2$Parent))

At the transcript level

.pca(vst.kt2,factor(samples.2$Parent))

4.1.2 Original data

At the gene level

.pca(vst.kg,factor(samples$Description),lgd="topright")

At the transcript level

.pca(vst.kt,factor(samples$Description),lgd="topright")

4.1.3 Combined data

At the gene level

.pca(vst.g,factor(samples$Description),lgd="topright")

At the transcript level

.pca(vst.t,factor(samples$Description),lgd="topright")

4.2 Heatmap

4.2.1 Re-sequencing data

1000 most variable genes labelled with genotype and temperature and showing per gene expression z-scores.

At the gene level

sel <- order(apply(vst.kg2,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kg2[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = samples.2$Parent)

At the transcript level

sel <- order(apply(vst.kt2,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kt2[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = samples.2$Parent)

4.2.2 Original data

At the gene level

sel <- order(apply(vst.kg,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kg[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = paste(samples$Genotype,samples$Temp))

At the transcript level

sel <- order(apply(vst.kt,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kt[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = paste(samples$Genotype,samples$Temp))

4.2.3 Combined data

At the gene level The classification is as expected from the HTSeq analysis

sel <- order(apply(vst.g,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.g[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = paste(samples$Genotype,samples$Temp))

At the transcript level It is interesting to observe that some samples cluster differently, indicating (i) a possible technical bias in quantifying transcripts and/or (ii) a true natural variability

sel <- order(apply(vst.t,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.t[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
          labCol = paste(samples$Genotype,samples$Temp))

4.3 PCA (on combined only)

4.3.1 First two dimensions at the gene level

pc <- prcomp(t(vst.g))
percent <- round(summary(pc)$importance[2,]*100)

The first dimension separates the 2 “final” temperature, whereas the second dimension separates to some extend the genotypes. The temperature, or its shift, also affect that second dimension

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(samples$Description)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
       col=pal[1:nlevels(samples$Description)],
       legend=levels(samples$Description))

Just to make the point clearer, plotting the temperature categorical data (C,W,CW,WC)

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Temp))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("top",pch=19,
       col=pal[1:nlevels(factor(samples$Temp))],
       legend=levels(factor(samples$Temp)))

And the same for the temperature at collection

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Temperature.at.collection))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("top",pch=19,
       col=pal[1:nlevels(factor(samples$Temperature.at.collection))],
       legend=levels(factor(samples$Temperature.at.collection)))

And the same for the age

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Age.in.days))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("top",pch=19,
       col=pal[1:nlevels(factor(samples$Age.in.days))],
       legend=levels(factor(samples$Age.in.days)))

4.3.2 2nd and 3rd dims

Which to some extend separates the pcp from the Col-0 genotypes

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(samples$Description)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(samples$Description)],
       legend=levels(samples$Description))

And the same plotting the genotype

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(samples$Genotype))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(factor(samples$Genotype))],
       legend=levels(factor(samples$Genotype)))

And finally the same for the age

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(samples$Age.in.days))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(factor(samples$Age.in.days))],
       legend=levels(factor(samples$Age.in.days)))

4.3.3 First two dimensions at the transcript level

pc <- prcomp(t(vst.t))
percent <- round(summary(pc)$importance[2,]*100)

The first dimension separates the 2 “final” temperature, whereas the second dimension separates to some extend the genotypes. The temperature, or its shift, also affect that second dimension

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(samples$Description)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")
legend("bottom",pch=19,
       col=pal[1:nlevels(samples$Description)],
       legend=levels(samples$Description))

Just to make the point clearer, plotting the temperature categorical data (C,W,CW,WC)

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Temp))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("bottom",pch=19,
       col=pal[1:nlevels(factor(samples$Temp))],
       legend=levels(factor(samples$Temp)))

And the same for the temperature at collection

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Temperature.at.collection))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("bottom",pch=19,
       col=pal[1:nlevels(factor(samples$Temperature.at.collection))],
       legend=levels(factor(samples$Temperature.at.collection)))

And the same for the age

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Age.in.days))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("bottom",pch=19,
       col=pal[1:nlevels(factor(samples$Age.in.days))],
       legend=levels(factor(samples$Age.in.days)))

4.3.4 2nd and 3rd dims

Which to some extend separates the pcp from the Col-0 genotypes

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(samples$Description)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(samples$Description)],
       legend=levels(samples$Description))

And the same plotting the genotype

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(samples$Genotype))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(factor(samples$Genotype))],
       legend=levels(factor(samples$Genotype)))

And finally the same for the age

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(samples$Age.in.days))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topleft",pch=19,
       col=pal[1:nlevels(factor(samples$Age.in.days))],
       legend=levels(factor(samples$Age.in.days)))

4.4 HTseq vs. kallisto comparison (combined only)

htseq <- read.csv("analysis/HTSeq/library-size-normalized_variance-stabilized_data.csv",
                  row.names=1)
colnames(htseq) <- sub("X","",colnames(htseq))

klst <- vst.g[match(sub("\\.0","",rownames(htseq)),rownames(vst.g)),
              match(colnames(htseq),colnames(vst.g))]

lapply(1:ncol(htseq),function(i){
  heatscatter(htseq[,i],
              klst[,i],
              main=paste("sample",colnames(htseq)[i]),
              xlab="HTSeq VST counts",
                ylab="Kallisto VST counts")
  })

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL

5 Conclusion

Using Kallisto at the gene or transcript levels gives identical results. Kallisto gives similar results to HTSeq but rescues a non insignificant number of genes, some of which are very highly expressed.

Next step is to perform the DE analysis on the kallisto genes and transcripts and compare these to the HTSeq results.

6 Session Info

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] hexbin_1.27.1              vsn_3.42.3                
##  [3] tximport_1.2.0             scatterplot3d_0.3-37      
##  [5] RColorBrewer_1.1-2         pander_0.6.0              
##  [7] LSD_3.0                    gplots_3.0.1              
##  [9] DESeq2_1.14.0              SummarizedExperiment_1.4.0
## [11] Biobase_2.34.0             GenomicRanges_1.26.1      
## [13] GenomeInfoDb_1.10.1        IRanges_2.8.1             
## [15] S4Vectors_0.12.0           BiocGenerics_0.20.0       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.8           locfit_1.5-9.1        lattice_0.20-34      
##  [4] gtools_3.5.0          assertthat_0.1        rprojroot_1.1        
##  [7] digest_0.6.10         plyr_1.8.4            chron_2.3-47         
## [10] backports_1.0.4       acepack_1.4.1         RSQLite_1.0.0        
## [13] evaluate_0.10         BiocInstaller_1.24.0  ggplot2_2.2.0        
## [16] zlibbioc_1.20.0       lazyeval_0.2.0        data.table_1.9.6     
## [19] annotate_1.52.0       gdata_2.17.0          rpart_4.1-10         
## [22] Matrix_1.2-7.1        preprocessCore_1.36.0 rmarkdown_1.2        
## [25] labeling_0.3          splines_3.3.1         BiocParallel_1.8.1   
## [28] geneplotter_1.52.0    stringr_1.1.0         foreign_0.8-67       
## [31] RCurl_1.95-4.8        munsell_0.4.3         htmltools_0.3.5      
## [34] nnet_7.3-12           tibble_1.2            gridExtra_2.2.1      
## [37] htmlTable_1.7         Hmisc_4.0-0           XML_3.98-1.5         
## [40] bitops_1.0-6          grid_3.3.1            affy_1.52.0          
## [43] xtable_1.8-2          gtable_0.2.0          DBI_0.5-1            
## [46] magrittr_1.5          scales_0.4.1          KernSmooth_2.23-15   
## [49] stringi_1.1.2         XVector_0.14.0        genefilter_1.56.0    
## [52] affyio_1.44.0         limma_3.30.4          latticeExtra_0.6-28  
## [55] Formula_1.2-1         tools_3.3.1           survival_2.40-1      
## [58] yaml_2.1.14           AnnotationDbi_1.36.0  colorspace_1.3-1     
## [61] cluster_2.0.5         caTools_1.17.1        knitr_1.15.1